in this document I create a new version for the flowcam classifier at 18 degrees and 30% light in early Feb 2022. Crypto is only kept at lower light and some classes (airbubbles, debris, etc.) have not been remearured.

0.1 Dependencies

library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)

set.seed(1)

0.2 Data

0.2.1 new data

colnames <- c("Particle_ID","Area_ABD","Area_Filled","Aspect_Ratio","Average_Blue",
              "Average_Green","Average_Red","Calibration_Factor","Calibration_Image",
              "Camera","Capture_X","Capture_Y","Ch1_Area","Ch1_Peak","Ch1_Width",
              "Ch2_Area","Ch2_Peak","Ch2_Width","Ch2_Ch1_Ratio","Circle_Fit",
              "Circularity","Circularity_Hu","Compactness","Convex_Perimeter",
              "Convexity","Date","Diameter_ABD","Diameter_ESD","Edge_Gradient",
              "Elongation","Feret_Angle_Max","Feret_Angle_Min","Fiber_Curl",
              "Fiber_Straightness","Filter_Score","Geodesic_Aspect_Ratio",
              "Geodesic_Length","Geodesic_Thickness","Image_File","Image_Height",
              "Image_Width","Image_X","Image_Y","Intensity","Length",
              "Particles_Per_Chain","Perimeter","Ratio_Blue_Green","Ratio_Red_Blue",
              "Ratio_Red_Green","Roughness","Scatter_Area","Scatter_Peak","Scatter_Width",
              "Sigma_Intensity","Source_Image","Sum_Intensity","Symmetry","Time",
              "Timestamp","Transparency","Volume_ABD","Volume_ESD","Width","species")

Chlamydomonas <- read_csv("Data/Chlamydomonas_light_February_2022_library.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("Data/Cosmarium_light_February_2022_library.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Desmodesmus <- read_csv("Data/Desmodesmus_light_February_2022_library.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("Data/Monoraphidium_light_February_2022_library.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("Data/Staurastrum1_light_February_2022_library.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("Data/Staurastrum2_light_February_2022_library.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

Cryptomonas <- read_csv("../../Class_18C_decreasingLight/Data/Cryptomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

dd_all_feb22 <- rbind(Chlamydomonas,Cosmarium,Desmodesmus,
                      Monoraphidium,Staurastrum1,Staurastrum2,
                      Cryptomonas) 

colnames(dd_all_feb22) <- colnames

0.2.2 old data and merge

dd_all <- read_csv("../1st_class_2020/Data/libraries_FC100/dd_all.csv", show_col_types = FALSE) %>%
  dplyr::filter(!(species %in% c("Chlamydomonas","Cosmarium","Desmodesmus",
                                 "Monoraphidium","Staurastrum1","Staurastrum2",
                                 "Cryptomonas")))

dd_all <- rbind(dd_all_feb22,dd_all) %>%
  dplyr::select(-Date)
table(dd_all$species)

0.3 filtering out outliers

dd_all$id <- 1:nrow(dd_all)
dd_all_long <- dd_all %>%
  dplyr::select(species, Area_ABD,Area_Filled,Aspect_Ratio,Average_Blue,
                Average_Green,Average_Red,Circle_Fit,Circularity,
                Compactness,Convexity,Diameter_ABD,Diameter_ESD,Edge_Gradient,
                Elongation,Feret_Angle_Max,Feret_Angle_Min,Fiber_Curl,
                Fiber_Straightness,Geodesic_Aspect_Ratio,Intensity,
                Length,Perimeter,Ratio_Blue_Green,Ratio_Red_Blue,
                Ratio_Red_Green,Roughness,Sigma_Intensity,Sum_Intensity,
                Symmetry,Transparency,Width, id) %>%
  pivot_longer(cols=-c(id,species), names_to="variable") %>%
  dplyr::group_by(variable,species) %>%
  mutate(iqr = IQR(value),
         first_quartile = quantile(value, probs = 0.25),
         third_quartile = quantile(value, probs = 0.75),
         outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))



outliers <- dd_all_long %>%
  dplyr::filter(outlier==T) %>%
  dplyr::group_by(species, id) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::filter(n>6)
## `summarise()` has grouped output by 'species'. You can override using the `.groups` argument.
table(outliers$species)
## 
##            airbubbles         Chlamydomonas   ChlamydomonasClumps 
##                    15                   347                     9 
##         Coleps_irchel             Colpidium     ColpidiumVacuoles 
##                    39                    11                    31 
##             Cosmarium           Cryptomonas                Debris 
##                    19                    33                    10 
##           Desmodesmus            Dexiostoma         DigestedAlgae 
##                   174                    18                     5 
## DividingChlamydomonas         Loxocephallus         Monoraphidium 
##                    13                    18                    41 
##          OtherCiliate    Small_unidentified          Staurastrum1 
##                     1                   219                    37 
##          Staurastrum2           Tetrahymena 
##                     5                     9
nrow(outliers)/nrow(dd_all)
## [1] 0.04359515
dd_all_filtered <- dd_all %>%
  dplyr::filter(!is.element(id,outliers$id))

dd_all$removed_outliers <- F
dd_all_filtered$removed_outliers <- T

dd_all_comparison <- rbind(dd_all,dd_all_filtered)


dd_all_comparison %>% 
  ggplot(aes(log10(Area_ABD), col=removed_outliers))+
  geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
  geom_boxplot(outlier.alpha = 0.3) +
  theme_bw() +
  facet_wrap(~species, scales = "free_y")  +
  geom_vline(xintercept = 1, col="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dd_all_filtered %>%
  ggplot(aes(log10(Area_ABD)))+
  geom_density(aes(col=species))

0.4 Species compositions

species <- unique(dd_all$species)
species <- species[!is.element(species,c("airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                                         "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))]
compositions <- read_csv("../../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition

compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- comp_name

0.5 Classifiers

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}
formula <- factor(species) ~ Area_ABD + Area_Filled + Aspect_Ratio +
      Average_Blue + Average_Green + Average_Red + Circle_Fit +
      Circularity + Compactness +
      Convexity + Diameter_ABD + Diameter_ESD + Edge_Gradient +
      Elongation + Feret_Angle_Max + Feret_Angle_Min + Fiber_Curl +
      Fiber_Straightness + Geodesic_Aspect_Ratio + Intensity + Length +
      Perimeter + Ratio_Blue_Green + Ratio_Red_Blue + Ratio_Red_Green +
      Roughness + Sigma_Intensity + Sum_Intensity + Symmetry + Transparency +
      Width

set.seed(62)
classifiers_18c <- list()
classifiers_18c_plots <- list()
classifiers_18c_assess_plots <- list()

trainingdata <- dd_all[complete.cases(dd_all), ]


for(i in 1:length(compositions_list)){
  
  if("Colpidium" %in% compositions_list[[i]]) {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                            "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
  } else {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
                            "DigestedAlgae","DividingChlamydomonas"))
  }

  n.var <- length(unique(sub_trainingdata$species))
  sub_trainingdata$species <- factor(sub_trainingdata$species)
  
  split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
  subsamples <- lapply(split_up, function(x) {
    x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
  sub_trainingdata <- do.call("rbind", subsamples)
  
  # table(sub_trainingdata$species)
  # A stratified random split of the data
  idx_train <- createDataPartition(sub_trainingdata$species,
                                   p = 0.7, # percentage of data as training
                                   list = FALSE)
  
  sub_testdata <- sub_trainingdata[-idx_train,]
  sub_trainingdata <- sub_trainingdata[idx_train,]
  
  n.min <- min(table(sub_trainingdata$species))
  
  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c[[i]] <- obj$best.model
  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                       sub_testdata$species)
  
  classifiers_18c_plots[[i]] <- classifier_plot_svm(table = cf$table,
                                                        combination.nr = names(compositions_list)[[i]])
  
  classifiers_18c_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}

names(classifiers_18c_plots) <- names(classifiers_18c) <-
  names(classifiers_18c_assess_plots) <- comp_name
library(patchwork)
for(i in 1:length(classifiers_18c_plots)){
  print(classifiers_18c_plots[[i]] + classifiers_18c_assess_plots[[i]] + 
    plot_layout(widths = c(7,3)))
}

0.6 Save classifiers

saveRDS(classifiers_18c, "svm_flowcam_classifiers_18c_normLight30_trained_earlyFeb22.rds")
# cl <- readRDS("classifiers_18c_25x.rds")
# labels <- dd_all %>% 
#   group_by(species) %>%  
#   summarise(xPos = max(Area_Filled),
#             yPos = max((density(Area_Filled))$y))
# 
# library(directlabels)
# direct.label(qplot(Area_Filled,data=dd_all,colour=species,geom="density",log="x"))